In [65]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

In [66]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

In [67]:
# Remove outliers
data_dict.pop('TOTAL')  # spreadsheet aggregate row, not a person
data_dict.pop('THE TRAVEL AGENCY IN THE PARK')  # not a person
data_dict.pop('LOCKHART EUGENE E')  # every feature is NaN
my_dataset = data_dict
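
For context, 'TOTAL' is the spreadsheet aggregate row and dwarfs every real record; a minimal sketch of how such an outlier shows up on a salary/bonus scatter plot (this would be run before the pop calls above; assumes matplotlib is available):

import matplotlib.pyplot as plt
for name in data_dict:
    salary = data_dict[name]['salary']
    bonus = data_dict[name]['bonus']
    if salary != 'NaN' and bonus != 'NaN':
        plt.scatter(salary, bonus)
plt.xlabel('salary')
plt.ylabel('bonus')
plt.show()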

In [68]:
# compute fraction
def computeFraction(poi_messages, all_messages):
    """ Given the number of messages to/from a POI (numerator)
        and the number of all messages to/from a person (denominator),
        return the fraction of that person's messages that involve a POI.
    """
    ### beware of "NaN" when there is no known email address (and so
    ### no filled email features), and of integer division!
    ### if either poi_messages or all_messages is "NaN", return 0
    if poi_messages == 'NaN' or all_messages == 'NaN':
        fraction = 0
    else:
        fraction = float(poi_messages) / all_messages

    return fraction
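
A quick sanity check of the helper with made-up values, confirming the 'NaN' guard and the float division:

print computeFraction('NaN', 100)  # -> 0
print computeFraction(10, 50)      # -> 0.2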

In [69]:
# add the two new ratio features to every record in the dataset
for name in my_dataset:
    data_point = my_dataset[name]
    from_poi_to_this_person = data_point["from_poi_to_this_person"]
    to_messages = data_point["to_messages"]
    data_point["fraction_from_poi"] = computeFraction(from_poi_to_this_person, to_messages)

    from_this_person_to_poi = data_point["from_this_person_to_poi"]
    from_messages = data_point["from_messages"]
    data_point["fraction_to_poi"] = computeFraction(from_this_person_to_poi, from_messages)

In [70]:
print 'Total # of people:', len(my_dataset)


Total # of people: 143

In [71]:
print 'Total # of features:', len(my_dataset[my_dataset.keys()[0]])


Total # of features: 23

In [72]:
# count POIs and non-POIs
positive = 0
negative = 0
for name in my_dataset:
    if my_dataset[name]['poi']:
        positive += 1
    else:
        negative += 1

print 'poi: ', positive
print 'non-poi: ', negative


poi:  18
non-poi:  125
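
With only 18 POIs against 125 non-POIs the classes are heavily imbalanced (roughly one POI in eight records), which is why plain accuracy is a weak yardstick here and the evaluation below leans on StratifiedShuffleSplit and F1. A one-line check of the ratio:

print 'POI ratio: %.3f' % (positive / float(positive + negative))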

In [73]:
# count 'NaN' placeholders per feature
from collections import defaultdict
na_count = defaultdict(int)
for name in my_dataset:
    for feature in my_dataset[name]:
        if my_dataset[name][feature] == 'NaN':
            na_count[feature] += 1
print na_count


defaultdict(<type 'int'>, {'salary': 49, 'to_messages': 57, 'deferral_payments': 105, 'total_payments': 20, 'exercised_stock_options': 42, 'bonus': 62, 'restricted_stock': 34, 'restricted_stock_deferred': 126, 'total_stock_value': 18, 'director_fees': 127, 'from_poi_to_this_person': 57, 'loan_advances': 140, 'from_messages': 57, 'other': 52, 'expenses': 49, 'from_this_person_to_poi': 57, 'deferred_income': 95, 'shared_receipt_with_poi': 57, 'email_address': 32, 'long_term_incentive': 78})
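
The raw defaultdict above is hard to scan; a small sketch that ranks the features by missing-value count:

for feature, count in sorted(na_count.items(), key=lambda kv: -kv[1]):
    print '%-26s %3d' % (feature, count)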

test_classifier (from tester.py)


In [74]:
import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

PERF_FORMAT_STRING = "\
\tAccuracy: {:>0.{display_precision}f}\tPrecision: {:>0.{display_precision}f}\t\
Recall: {:>0.{display_precision}f}\tF1: {:>0.{display_precision}f}\tF2: {:>0.{display_precision}f}"
RESULTS_FORMAT_STRING = "\tTotal predictions: {:4d}\tTrue positives: {:4d}\tFalse positives: {:4d}\
\tFalse negatives: {:4d}\tTrue negatives: {:4d}"

def test_classifier(clf, dataset, feature_list, folds = 1000):
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv: 
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print "Warning: Found a predicted label not == 0 or 1."
                print "All predictions should take value 0 or 1."
                print "Evaluating performance for processed predictions:"
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print clf
        print PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5)
        print RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives)
        print ""
    except ZeroDivisionError:
        print "Got a divide by zero when trying out:", clf
        print "Precision or recall may be undefined due to a lack of true positive predictions."

DecisionTreeClassifier for all features


In [75]:
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import StratifiedShuffleSplit

features_list = ['poi', 'salary', 'deferral_payments', 'total_payments', 'loan_advances',
                 'bonus', 'restricted_stock_deferred', 'deferred_income', 'total_stock_value',
                 'expenses', 'exercised_stock_options', 'other', 'long_term_incentive',
                 'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person',
                 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi',
                 'fraction_from_poi', 'fraction_to_poi']
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [76]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10],
              }
cv = StratifiedShuffleSplit(labels, 100, random_state = 42)

clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')
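
The grid above covers 7 x 7 x 5 = 245 parameter combinations; with 100 stratified splits that is 24,500 individual fits (plus one final refit on the best setting), which accounts for the ~84 s search time in the next cell. The arithmetic, for reference:

n_candidates = 7 * 7 * 5
print n_candidates, 'parameter combinations'
print n_candidates * 100, 'fits during the search'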

In [77]:
from time import time
t0 = time()
clf_grid.fit(features, labels)
print "Fitting done in %0.3fs" % (time() - t0)


Fitting done in 84.134s

In [78]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf = clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_


Best estimator found by grid search:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=15, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=15, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.82893	Precision: 0.34119	Recall: 0.30400	F1: 0.32152	F2: 0.31077
	Total predictions: 15000	True positives:  608	False positives: 1174	False negatives: 1392	True negatives: 11826

Out[78]:
array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.08330184,  0.17014983,  0.40804035,
        0.        ,  0.        ,  0.04565957,  0.        ,  0.        ,
        0.        ,  0.11832326,  0.        ,  0.        ,  0.        ,
        0.17452516])
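
The bare importance array is hard to read; a sketch that pairs each value with its feature name (features_list[1:] skips the 'poi' target):

for name, imp in sorted(zip(features_list[1:], clf.feature_importances_), key=lambda pair: -pair[1]):
    if imp > 0:
        print '%-26s %.4f' % (name, imp)

The six features with nonzero importance are exactly the ones kept in the next section.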

DecisionTree for the six important features


In [79]:
features_list = ['poi', 'total_stock_value', 'expenses', 'exercised_stock_options', 'restricted_stock', 'from_messages', 'fraction_to_poi']
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [80]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10],
              }
cv = StratifiedShuffleSplit(labels, 100, random_state = 42)

clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')

In [81]:
from time import time
t0 = time()
clf_grid.fit(features, labels)
print "Fitting done in %0.3fs" % (time() - t0)


Fitting done in 73.858s

In [82]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf = clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_


Best estimator found by grid search:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=8,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=8,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.83800	Precision: 0.44398	Recall: 0.53100	F1: 0.48361	F2: 0.51097
	Total predictions: 14000	True positives: 1062	False positives: 1330	False negatives:  938	True negatives: 10670

Out[82]:
array([ 0.        ,  0.41964445,  0.07851991,  0.        ,  0.        ,
        0.50183564])

DecisionTree for a further reduced set of important features


In [83]:
features_list = ['poi', 'expenses', 'exercised_stock_options', 'fraction_to_poi']
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [84]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10],
              }
cv = StratifiedShuffleSplit(labels, 100, random_state = 42)

clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')

In [85]:
from time import time
t0 = time()
clf_grid.fit(features, labels)
print "Fitting done in %0.3fs" % (time() - t0)


Fitting done in 76.130s

In [86]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf = clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_


Best estimator found by grid search:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=8,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=8,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.83057	Precision: 0.42536	Recall: 0.53000	F1: 0.47195	F2: 0.50515
	Total predictions: 14000	True positives: 1060	False positives: 1432	False negatives:  940	True negatives: 10568

Out[86]:
array([ 0.49230607,  0.        ,  0.50769393])

DecisionTree for the six important features with MinMaxScaler


In [87]:
from sklearn.preprocessing import MinMaxScaler
features_list = ['poi', 'total_stock_value', 'expenses', 'exercised_stock_options', 'restricted_stock', 'from_messages', 'fraction_to_poi']
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
features = MinMaxScaler().fit_transform(features)
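
Two notes on this cell: a decision tree is invariant to monotonic rescaling, so MinMax scaling should not change its splits (the importances in Out[90] below match Out[80] exactly); and fitting the scaler on the full feature matrix before cross-validation leaks test-fold statistics into training. A leakage-free alternative is to scale inside a Pipeline (a sketch, same sklearn generation assumed):

from sklearn.pipeline import Pipeline
pipe = Pipeline([('scale', MinMaxScaler()), ('tree', DecisionTreeClassifier())])
pipe_param_grid = {'tree__' + key: values for key, values in param_grid.items()}
# GridSearchCV(pipe, param_grid=pipe_param_grid, cv=cv, scoring='f1')
# would then fit the scaler on each training fold only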

In [88]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10],
              }
cv = StratifiedShuffleSplit(labels, 100, random_state = 42)

clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')

In [89]:
from time import time
t0 = time()
clf_grid.fit(features, labels)
print "Fitting done in %0.3fs" % (time() - t0)


Fitting done in 67.031s

In [90]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf = clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_


Best estimator found by grid search:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=8,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=8,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.83793	Precision: 0.44389	Recall: 0.53200	F1: 0.48397	F2: 0.51169
	Total predictions: 14000	True positives: 1064	False positives: 1333	False negatives:  936	True negatives: 10667

Out[90]:
array([ 0.        ,  0.41964445,  0.07851991,  0.        ,  0.        ,
        0.50183564])

DecisionTree for the important features, excluding the newly created feature, with MinMaxScaler


In [91]:
features_list = ['poi', 'total_stock_value', 'expenses', 'exercised_stock_options', 'restricted_stock', 'from_messages']
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
features = MinMaxScaler().fit_transform(features)

In [92]:
param_grid = {'max_depth': [2, 3, 4, 5, 6, 7, 8],
              'min_samples_split': [2, 5, 8, 10, 12, 15, 20],
              'min_samples_leaf': [1, 2, 5, 8, 10],
              }
cv = StratifiedShuffleSplit(labels, 100, random_state = 42)

clf_grid = GridSearchCV(DecisionTreeClassifier(), param_grid=param_grid, cv=cv, scoring='f1')

In [93]:
from time import time
t0 = time()
clf_grid.fit(features, labels)
print "Fitting done in %0.3fs" % (time() - t0)


Fitting done in 77.731s

In [94]:
print "Best estimator found by grid search:"
print clf_grid.best_estimator_
clf = clf_grid.best_estimator_
test_classifier(clf, my_dataset, features_list, folds = 1000)
clf.feature_importances_


Best estimator found by grid search:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=8,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.80250	Precision: 0.30554	Recall: 0.30050	F1: 0.30300	F2: 0.30149
	Total predictions: 14000	True positives:  601	False positives: 1366	False negatives: 1399	True negatives: 10634

Out[94]:
array([ 0.22307666,  0.33850749,  0.15116764,  0.25436905,  0.03287917])

GaussianNB


In [95]:
from sklearn.naive_bayes import GaussianNB
features_list = ['poi', 'total_stock_value', 'expenses', 'exercised_stock_options', 'restricted_stock', 'from_messages', 'fraction_to_poi']
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
features = MinMaxScaler().fit_transform(features)
# test_classifier re-extracts the features from my_dataset and fits the model
# on each CV fold internally, so no manual fit is needed before calling it
clf = GaussianNB()
test_classifier(clf, my_dataset, features_list, folds = 1000)


GaussianNB()
	Accuracy: 0.84757	Precision: 0.44472	Recall: 0.26950	F1: 0.33562	F2: 0.29255
	Total predictions: 14000	True positives:  539	False positives:  673	False negatives: 1461	True negatives: 11327
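
dump_classifier_and_data was imported at the top of the notebook but never called; the project's tester expects the chosen classifier, dataset, and feature list to be written out at the end. A minimal closing sketch (whichever clf is ultimately selected above would be passed in):

dump_classifier_and_data(clf, my_dataset, features_list)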